1   package org.apache.lucene.sandbox.queries;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one or more
5    * contributor license agreements.  See the NOTICE file distributed with
6    * this work for additional information regarding copyright ownership.
7    * The ASF licenses this file to You under the Apache License, Version 2.0
8    * (the "License"); you may not use this file except in compliance with
9    * the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
40  
41  /** 
 * Tests the results of fuzzy queries against pre-recorded output.
43   * The format of the file is the following:
44   * 
45   * Header Row: # of bits: generate 2^n sequential documents 
46   * with a value of Integer.toBinaryString
47   * 
48   * Entries: an entry is a param spec line, a resultCount line, and
49   * then 'resultCount' results lines. The results lines are in the
50   * expected order.
51   * 
52   * param spec line: a comma-separated list of params to FuzzyQuery
53   *   (query, prefixLen, pqSize, minScore)
54   * query = query text as a number (expand with Integer.toBinaryString)
55   * prefixLen = prefix length
56   * pqSize = priority queue maximum size for TopTermsBoostOnlyBooleanQueryRewrite
57   * minScore = minimum similarity
58   * 
59   * resultCount line: total number of expected hits.
60   * 
61   * results line: comma-separated docID, score pair
62   **/
63  public class TestSlowFuzzyQuery2 extends LuceneTestCase {
64    /** epsilon for score comparisons */
65    static final float epsilon = 0.00001f;
66  
67    static int[][] mappings = new int[][] {
68      new int[] { 0x40, 0x41 },
69      new int[] { 0x40, 0x0195 },
70      new int[] { 0x40, 0x0906 },
71      new int[] { 0x40, 0x1040F },
72      new int[] { 0x0194, 0x0195 },
73      new int[] { 0x0194, 0x0906 },
74      new int[] { 0x0194, 0x1040F },
75      new int[] { 0x0905, 0x0906 },
76      new int[] { 0x0905, 0x1040F },
77      new int[] { 0x1040E, 0x1040F }
78    };
79    public void testFromTestData() throws Exception {
80      // TODO: randomize!
81      assertFromTestData(mappings[random().nextInt(mappings.length)]);
82    }
83  
84    public void assertFromTestData(int codePointTable[]) throws Exception {
85      if (VERBOSE) {
86        System.out.println("TEST: codePointTable=" + codePointTable);
87      }
88      InputStream stream = getClass().getResourceAsStream("fuzzyTestData.txt");
89      BufferedReader reader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
90      
91      int bits = Integer.parseInt(reader.readLine());
92      int terms = (int) Math.pow(2, bits);
93      
94      Directory dir = newDirectory();
95      Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
96      RandomIndexWriter writer = new RandomIndexWriter(random(), dir, newIndexWriterConfig(analyzer).setMergePolicy(newLogMergePolicy()));
97  
98      Document doc = new Document();
99      Field field = newTextField("field", "", Field.Store.NO);
100     doc.add(field);
101     
102     for (int i = 0; i < terms; i++) {
103       field.setStringValue(mapInt(codePointTable, i));
104       writer.addDocument(doc);
105     }   
106     
107     IndexReader r = writer.getReader();
108     IndexSearcher searcher = newSearcher(r);
109     if (VERBOSE) {
110       System.out.println("TEST: searcher=" + searcher);
111     }
112     // even though this uses a boost-only rewrite, this test relies upon queryNorm being the default implementation,
113     // otherwise scores are different!
114     searcher.setSimilarity(new DefaultSimilarity());
115     
116     writer.close();
117     String line;
118     while ((line = reader.readLine()) != null) {
119       String params[] = line.split(",");
120       String query = mapInt(codePointTable, Integer.parseInt(params[0]));
121       int prefix = Integer.parseInt(params[1]);
122       int pqSize = Integer.parseInt(params[2]);
123       float minScore = Float.parseFloat(params[3]);
124       SlowFuzzyQuery q = new SlowFuzzyQuery(new Term("field", query), minScore, prefix);
125       q.setRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(pqSize));
126       int expectedResults = Integer.parseInt(reader.readLine());
127       TopDocs docs = searcher.search(q, expectedResults);
128       assertEquals(expectedResults, docs.totalHits);
129       for (int i = 0; i < expectedResults; i++) {
130         String scoreDoc[] = reader.readLine().split(",");
131         assertEquals(Integer.parseInt(scoreDoc[0]), docs.scoreDocs[i].doc);
132         assertEquals(Float.parseFloat(scoreDoc[1]), docs.scoreDocs[i].score, epsilon);
133       }
134     }
135     IOUtils.close(r, dir, analyzer);
136   }
137   
138   /* map bits to unicode codepoints */
139   private static String mapInt(int codePointTable[], int i) {
140     StringBuilder sb = new StringBuilder();
141     String binary = Integer.toBinaryString(i);
142     for (int j = 0; j < binary.length(); j++)
143       sb.appendCodePoint(codePointTable[binary.charAt(j) - '0']);
144     return sb.toString();
145   }
146 
147   /* Code to generate test data
148   public static void main(String args[]) throws Exception {
149     int bits = 3;
150     System.out.println(bits);
151     int terms = (int) Math.pow(2, bits);
152     
153     RAMDirectory dir = new RAMDirectory();
154     IndexWriter writer = new IndexWriter(dir, new KeywordAnalyzer(),
155         IndexWriter.MaxFieldLength.UNLIMITED);
156     
157     Document doc = new Document();
158     Field field = newField("field", "", Field.Store.NO, Field.Index.ANALYZED);
159     doc.add(field);
160 
161     for (int i = 0; i < terms; i++) {
162       field.setValue(Integer.toBinaryString(i));
163       writer.addDocument(doc);
164     }
165     
166     writer.forceMerge(1);
167     writer.close();
168 
169     IndexSearcher searcher = new IndexSearcher(dir);
170     for (int prefix = 0; prefix < bits; prefix++)
171       for (int pqsize = 1; pqsize <= terms; pqsize++)
172         for (float minscore = 0.1F; minscore < 1F; minscore += 0.2F)
173           for (int query = 0; query < terms; query++) {
174             FuzzyQuery q = new FuzzyQuery(
175                 new Term("field", Integer.toBinaryString(query)), minscore, prefix);
176             q.setRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(pqsize));
177             System.out.println(query + "," + prefix + "," + pqsize + "," + minscore);
178             TopDocs docs = searcher.search(q, terms);
179             System.out.println(docs.totalHits);
180             for (int i = 0; i < docs.totalHits; i++)
181               System.out.println(docs.scoreDocs[i].doc + "," + docs.scoreDocs[i].score);
182           }
183   }
184   */
185 }